<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - May 05, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Tailwind text-4xl or 5xl */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional) */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-2 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>May 05, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">DualReal: Adaptive Joint Training for Lossless Identity-Motion Fusion in Video Customization</h2>
            <p class="paper-summary">Customized text-to-video generation with pre-trained large-scale models has
recently garnered significant attention through focusing on identity and motion
consistency. Existing works typically follow the isolated customized paradigm,
where the subject identity or motion dynamics are customized exclusively.
However, this paradigm completely ignores the intrinsic mutual constraints and
synergistic interdependencies between identity and motion, resulting in
identity-motion conflicts throughout the generation process that systematically
degrades. To address this, we introduce DualReal, a novel framework that,
employs adaptive joint training to collaboratively construct interdependencies
between dimensions. Specifically, DualReal is composed of two units: (1)
Dual-aware Adaptation dynamically selects a training phase (i.e., identity or
motion), learns the current information guided by the frozen dimension prior,
and employs a regularization strategy to avoid knowledge leakage; (2)
StageBlender Controller leverages the denoising stages and Diffusion
Transformer depths to guide different dimensions with adaptive granularity,
avoiding conflicts at various stages and ultimately achieving lossless fusion
of identity and motion patterns. We constructed a more comprehensive benchmark
than existing methods. The experimental results show that DualReal improves
CLIP-I and DINO-I metrics by 21.7% and 31.8% on average, and achieves top
performance on nearly all motion quality metrics.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces dualreal, a framework that addresses identity-motion conflicts in customized text-to-video generation by adaptively and jointly training identity and motion, achieving improved performance on identity and motion quality metrics.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了dualreal，一个通过自适应地联合训练身份和运动来解决定制文本到视频生成中身份-运动冲突的框架，并在身份和运动质量指标上实现了性能的提升。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02192v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Wenchuan Wang, Mengqi Huang, Yijing Tu, Zhendong Mao</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Regression s all you need for medical image translation</h2>
            <p class="paper-summary">The acquisition of information-rich images within a limited time budget is
crucial in medical imaging. Medical image translation (MIT) can help enhance
and supplement existing datasets by generating synthetic images from acquired
data. While Generative Adversarial Nets (GANs) and Diffusion Models (DMs) have
achieved remarkable success in natural image generation, their benefits -
creativity and image realism - do not necessarily transfer to medical
applications where highly accurate anatomical information is required. In fact,
the imitation of acquisition noise or content hallucination hinder clinical
utility. Here, we introduce YODA (You Only Denoise once - or Average), a novel
2.5D diffusion-based framework for volumetric MIT. YODA unites diffusion and
regression paradigms to produce realistic or noise-free outputs. Furthermore,
we propose Expectation-Approximation (ExpA) DM sampling, which draws
inspiration from MRI signal averaging. ExpA-sampling suppresses generated noise
and, thus, eliminates noise from biasing the evaluation of image quality.
Through extensive experiments on four diverse multi-modal datasets - comprising
multi-contrast brain MRI and pelvic MRI-CT - we show that diffusion and
regression sampling yield similar results in practice. As such, the
computational overhead of diffusion sampling does not provide systematic
benefits in medical information translation. Building on these insights, we
demonstrate that YODA outperforms several state-of-the-art GAN and DM methods.
Notably, YODA-generated images are shown to be interchangeable with, or even
superior to, physical acquisitions for several downstream tasks. Our findings
challenge the presumed advantages of DMs in MIT and pave the way for the
practical application of MIT in medical imaging.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces yoda, a 2.5d diffusion-based framework for medical image translation, showing that a regression approach can outperform more complex diffusion models and gans while also being computationally efficient.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种名为yoda的2.5d扩散医学图像转换框架。研究表明，回归方法能够胜过更复杂的扩散模型和gan，同时兼顾计算效率。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02048v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Sebastian Rassmann, David Kügler, Christian Ewert, Martin Reuter</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">GenSync: A Generalized Talking Head Framework for Audio-driven Multi-Subject Lip-Sync using 3D Gaussian Splatting</h2>
            <p class="paper-summary">We introduce GenSync, a novel framework for multi-identity lip-synced video
synthesis using 3D Gaussian Splatting. Unlike most existing 3D methods that
require training a new model for each identity , GenSync learns a unified
network that synthesizes lip-synced videos for multiple speakers. By
incorporating a Disentanglement Module, our approach separates
identity-specific features from audio representations, enabling efficient
multi-identity video synthesis. This design reduces computational overhead and
achieves 6.8x faster training compared to state-of-the-art models, while
maintaining high lip-sync accuracy and visual quality.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: gensync is a new framework using 3d gaussian splatting for multi-identity lip-synced video synthesis, offering faster training and competitive quality compared to single-identity models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: gensync是一种新的框架，使用3d高斯溅射进行多身份唇形同步视频合成，与单身份模型相比，它提供了更快的训练速度和具有竞争力的质量。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01928v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Anushka Agarwal, Muhammad Yusuf Hassan, Talha Chafekar</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.15000000000000002, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">SignSplat: Rendering Sign Language via Gaussian Splatting</h2>
            <p class="paper-summary">State-of-the-art approaches for conditional human body rendering via Gaussian
splatting typically focus on simple body motions captured from many views. This
is often in the context of dancing or walking. However, for more complex use
cases, such as sign language, we care less about large body motion and more
about subtle and complex motions of the hands and face. The problems of
building high fidelity models are compounded by the complexity of capturing
multi-view data of sign. The solution is to make better use of sequence data,
ensuring that we can overcome the limited information from only a few views by
exploiting temporal variability. Nevertheless, learning from sequence-level
data requires extremely accurate and consistent model fitting to ensure that
appearance is consistent across complex motions. We focus on how to achieve
this, constraining mesh parameters to build an accurate Gaussian splatting
framework from few views capable of modelling subtle human motion. We leverage
regularization techniques on the Gaussian parameters to mitigate overfitting
and rendering artifacts. Additionally, we propose a new adaptive control method
to densify Gaussians and prune splat points on the mesh surface. To demonstrate
the accuracy of our approach, we render novel sequences of sign language video,
building on neural machine translation approaches to sign stitching. On
benchmark datasets, our approach achieves state-of-the-art performance; and on
highly articulated and complex sign language motion, we significantly
outperform competing approaches.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents signsplat, a gaussian splatting framework for rendering sign language from few views, focusing on subtle hand and face movements by using sequence data and regularization techniques. it achieves state-of-the-art results on sign language datasets.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出signsplat，一个高斯溅射框架，用于从少数视角渲染手语，侧重于细微的手部和面部动作，通过使用序列数据和正则化技术来实现。 在手语数据集上实现了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02108v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Maksym Ivashechkin, Oscar Mendez, Richard Bowden</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">HandOcc: NeRF-based Hand Rendering with Occupancy Networks</h2>
            <p class="paper-summary">We propose HandOcc, a novel framework for hand rendering based upon
occupancy. Popular rendering methods such as NeRF are often combined with
parametric meshes to provide deformable hand models. However, in doing so, such
approaches present a trade-off between the fidelity of the mesh and the
complexity and dimensionality of the parametric model. The simplicity of
parametric mesh structures is appealing, but the underlying issue is that it
binds methods to mesh initialization, making it unable to generalize to objects
where a parametric model does not exist. It also means that estimation is tied
to mesh resolution and the accuracy of mesh fitting. This paper presents a
pipeline for meshless 3D rendering, which we apply to the hands. By providing
only a 3D skeleton, the desired appearance is extracted via a convolutional
model. We do this by exploiting a NeRF renderer conditioned upon an
occupancy-based representation. The approach uses the hand occupancy to resolve
hand-to-hand interactions further improving results, allowing fast rendering,
and excellent hand appearance transfer. On the benchmark InterHand2.6M dataset,
we achieved state-of-the-art results.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: handocc proposes a novel meshless hand rendering pipeline using nerf conditioned on an occupancy-based representation, achieving state-of-the-art results on the interhand2.6m dataset.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: handocc 提出了一种新颖的无网格手部渲染流程，该流程使用以 occupancy 为基础表示的 nerf，并在 interhand2.6m 数据集上取得了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02079v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Maksym Ivashechkin, Oscar Mendez, Richard Bowden</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">OT-Talk: Animating 3D Talking Head with Optimal Transportation</h2>
            <p class="paper-summary">Animating 3D head meshes using audio inputs has significant applications in
AR/VR, gaming, and entertainment through 3D avatars. However, bridging the
modality gap between speech signals and facial dynamics remains a challenge,
often resulting in incorrect lip syncing and unnatural facial movements. To
address this, we propose OT-Talk, the first approach to leverage optimal
transportation to optimize the learning model in talking head animation.
Building on existing learning frameworks, we utilize a pre-trained Hubert model
to extract audio features and a transformer model to process temporal
sequences. Unlike previous methods that focus solely on vertex coordinates or
displacements, we introduce Chebyshev Graph Convolution to extract geometric
features from triangulated meshes. To measure mesh dissimilarities, we go
beyond traditional mesh reconstruction errors and velocity differences between
adjacent frames. Instead, we represent meshes as probability measures and
approximate their surfaces. This allows us to leverage the sliced Wasserstein
distance for modeling mesh variations. This approach facilitates the learning
of smooth and accurate facial motions, resulting in coherent and natural facial
animations. Our experiments on two public audio-mesh datasets demonstrate that
our method outperforms state-of-the-art techniques both quantitatively and
qualitatively in terms of mesh reconstruction accuracy and temporal alignment.
In addition, we conducted a user perception study with 20 volunteers to further
assess the effectiveness of our approach.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces ot-talk, a novel approach using optimal transportation and geometric feature extraction to improve the accuracy and naturalness of 3d talking head animation from audio inputs, demonstrating superior performance compared to state-of-the-art methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 ot-talk，一种利用最优传输和几何特征提取的新方法，旨在提高从音频输入生成 3d 说话头动画的准确性和自然性，实验证明其性能优于现有技术。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01932v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Xinmu Wang, Xiang Gao, Xiyun Song, Heather Yu, Zongfang Lin, Liang Peng, Xianfeng Gu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.30000000000000004, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Small Clips, Big Gains: Learning Long-Range Refocused Temporal Information for Video Super-Resolution</h2>
            <p class="paper-summary">Video super-resolution (VSR) can achieve better performance compared to
single image super-resolution by additionally leveraging temporal information.
In particular, the recurrent-based VSR model exploits long-range temporal
information during inference and achieves superior detail restoration. However,
effectively learning these long-term dependencies within long videos remains a
key challenge. To address this, we propose LRTI-VSR, a novel training framework
for recurrent VSR that efficiently leverages Long-Range Refocused Temporal
Information. Our framework includes a generic training strategy that utilizes
temporal propagation features from long video clips while training on shorter
video clips. Additionally, we introduce a refocused intra&inter-frame
transformer block which allows the VSR model to selectively prioritize useful
temporal information through its attention module while further improving
inter-frame information utilization in the FFN module. We evaluate LRTI-VSR on
both CNN and transformer-based VSR architectures, conducting extensive ablation
studies to validate the contribution of each component. Experiments on
long-video test sets demonstrate that LRTI-VSR achieves state-of-the-art
performance while maintaining training and computational efficiency.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces lrti-vsr, a novel training framework for recurrent video super-resolution that efficiently leverages long-range temporal information by training on short clips while utilizing temporal propagation features from longer clips and a refocused transformer block, achieving sota results.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种名为lrti-vsr的循环视频超分辨率训练框架，通过在短视频片段上训练，同时利用来自长视频片段的时间传播特征和一个重新聚焦的transformer模块，有效地利用了长距离的时间信息，并实现了最先进的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02159v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Xingyu Zhou, Wei Long, Jingbo Lu, Shiyin Jiang, Weiyi You, Haifeng Wu, Shuhang Gu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.35000000000000003, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Hierarchical Compact Clustering Attention (COCA) for Unsupervised Object-Centric Learning</h2>
            <p class="paper-summary">We propose the Compact Clustering Attention (COCA) layer, an effective
building block that introduces a hierarchical strategy for object-centric
representation learning, while solving the unsupervised object discovery task
on single images. COCA is an attention-based clustering module capable of
extracting object-centric representations from multi-object scenes, when
cascaded into a bottom-up hierarchical network architecture, referred to as
COCA-Net. At its core, COCA utilizes a novel clustering algorithm that
leverages the physical concept of compactness, to highlight distinct object
centroids in a scene, providing a spatial inductive bias. Thanks to this
strategy, COCA-Net generates high-quality segmentation masks on both the
decoder side and, notably, the encoder side of its pipeline. Additionally,
COCA-Net is not bound by a predetermined number of object masks that it
generates and handles the segmentation of background elements better than its
competitors. We demonstrate COCA-Net's segmentation performance on six widely
adopted datasets, achieving superior or competitive results against the
state-of-the-art models across nine different evaluation metrics.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces coca and coca-net, a hierarchical attention-based clustering approach for unsupervised object-centric representation learning and segmentation in single images, achieving competitive or superior results on several benchmark datasets.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了coca和coca-net，一种用于无监督的以物体为中心的表征学习和单张图像分割的层级式、基于注意力的聚类方法，并在多个基准数据集上取得了具有竞争力的或更优异的结果。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02071v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Can Küçüksözen, Yücel Yemez</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">R-Bench: Graduate-level Multi-disciplinary Benchmarks for LLM & MLLM Complex Reasoning Evaluation</h2>
            <p class="paper-summary">Reasoning stands as a cornerstone of intelligence, enabling the synthesis of
existing knowledge to solve complex problems. Despite remarkable progress,
existing reasoning benchmarks often fail to rigorously evaluate the nuanced
reasoning capabilities required for complex, real-world problemsolving,
particularly in multi-disciplinary and multimodal contexts. In this paper, we
introduce a graduate-level, multi-disciplinary, EnglishChinese benchmark,
dubbed as Reasoning Bench (R-Bench), for assessing the reasoning capability of
both language and multimodal models. RBench spans 1,094 questions across 108
subjects for language model evaluation and 665 questions across 83 subjects for
multimodal model testing in both English and Chinese. These questions are
meticulously curated to ensure rigorous difficulty calibration, subject
balance, and crosslinguistic alignment, enabling the assessment to be an
Olympiad-level multi-disciplinary benchmark. We evaluate widely used models,
including OpenAI o1, GPT-4o, DeepSeek-R1, etc. Experimental results indicate
that advanced models perform poorly on complex reasoning, especially multimodal
reasoning. Even the top-performing model OpenAI o1 achieves only 53.2% accuracy
on our multimodal evaluation. Data and code are made publicly available at
here.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces r-bench, a new graduate-level, multi-disciplinary, english-chinese benchmark for evaluating the complex reasoning capabilities of llms and mllms, revealing the limitations of even state-of-the-art models.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了r-bench，一个新的研究生水平、多学科、英汉双语的基准测试，用于评估llm和mllm的复杂推理能力，揭示了即使是最先进模型的局限性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02018v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Meng-Hao Guo, Jiajun Xu, Yi Zhang, Jiaxi Song, Haoyang Peng, Yi-Xuan Deng, Xinzhi Dong, Kiyohiro Nakayama, Zhengyang Geng, Chen Wang, Bolin Ni, Guo-Wei Yang, Yongming Rao, Houwen Peng, Han Hu, Gordon Wetzstein, Shi-min Hu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Learning Heterogeneous Mixture of Scene Experts for Large-scale Neural Radiance Fields</h2>
            <p class="paper-summary">Recent NeRF methods on large-scale scenes have underlined the importance of
scene decomposition for scalable NeRFs. Although achieving reasonable
scalability, there are several critical problems remaining unexplored, i.e.,
learnable decomposition, modeling scene heterogeneity, and modeling efficiency.
In this paper, we introduce Switch-NeRF++, a Heterogeneous Mixture of Hash
Experts (HMoHE) network that addresses these challenges within a unified
framework. It is a highly scalable NeRF that learns heterogeneous decomposition
and heterogeneous NeRFs efficiently for large-scale scenes in an end-to-end
manner. In our framework, a gating network learns to decomposes scenes and
allocates 3D points to specialized NeRF experts. This gating network is
co-optimized with the experts, by our proposed Sparsely Gated Mixture of
Experts (MoE) NeRF framework. We incorporate a hash-based gating network and
distinct heterogeneous hash experts. The hash-based gating efficiently learns
the decomposition of the large-scale scene. The distinct heterogeneous hash
experts consist of hash grids of different resolution ranges, enabling
effective learning of the heterogeneous representation of different scene
parts. These design choices make our framework an end-to-end and highly
scalable NeRF solution for real-world large-scale scene modeling to achieve
both quality and efficiency. We evaluate our accuracy and scalability on
existing large-scale NeRF datasets and a new dataset with very large-scale
scenes ($>6.5km^2$) from UrbanBIS. Extensive experiments demonstrate that our
approach can be easily scaled to various large-scale scenes and achieve
state-of-the-art scene rendering accuracy. Furthermore, our method exhibits
significant efficiency, with an 8x acceleration in training and a 16x
acceleration in rendering compared to Switch-NeRF. Codes will be released in
https://github.com/MiZhenxing/Switch-NeRF.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces switch-nerf++, a highly scalable nerf method for large-scale scene modeling that uses a heterogeneous mixture of hash experts to achieve state-of-the-art rendering accuracy with significant training and rendering efficiency improvements.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了switch-nerf++，一种可高度扩展的nerf方法，用于大规模场景建模。该方法使用异构哈希专家混合模型（heterogeneous mixture of hash experts），实现了最先进的渲染精度，并显著提高了训练和渲染效率。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02005v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zhenxing Mi, Ping Yin, Xue Xiao, Dan Xu</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Visual Dominance and Emerging Multimodal Approaches in Distracted Driving Detection: A Review of Machine Learning Techniques</h2>
            <p class="paper-summary">Distracted driving continues to be a significant cause of road traffic
injuries and fatalities worldwide, even with advancements in driver monitoring
technologies. Recent developments in machine learning (ML) and deep learning
(DL) have primarily focused on visual data to detect distraction, often
neglecting the complex, multimodal nature of driver behavior. This systematic
review assesses 74 peer-reviewed studies from 2019 to 2024 that utilize ML/DL
techniques for distracted driving detection across visual, sensor-based,
multimodal, and emerging modalities. The review highlights a significant
prevalence of visual-only models, particularly convolutional neural networks
(CNNs) and temporal architectures, which achieve high accuracy but show limited
generalizability in real-world scenarios. Sensor-based and physiological models
provide complementary strengths by capturing internal states and vehicle
dynamics, while emerging techniques, such as auditory sensing and radio
frequency (RF) methods, offer privacy-aware alternatives. Multimodal
architecture consistently surpasses unimodal baselines, demonstrating enhanced
robustness, context awareness, and scalability by integrating diverse data
streams. These findings emphasize the need to move beyond visual-only
approaches and adopt multimodal systems that combine visual, physiological, and
vehicular cues while keeping in checking the need to balance computational
requirements. Future research should focus on developing lightweight,
deployable multimodal frameworks, incorporating personalized baselines, and
establishing cross-modality benchmarks to ensure real-world reliability in
advanced driver assistance systems (ADAS) and road safety interventions.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper reviews machine learning techniques for distracted driving detection, highlighting the prevalence of visual-only models and advocating for multimodal approaches that integrate visual, physiological, and vehicular cues for improved robustness and real-world applicability.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文回顾了用于检测驾驶分心的机器学习技术，强调了纯视觉模型的主导地位，并提倡采用集成视觉、生理和车辆线索的多模态方法，以提高鲁棒性和现实应用性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01973v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Anthony Dontoh, Stephanie Ivey, Logan Sirbaugh, Andrews Danyo, Armstrong Aboah</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">CMAWRNet: Multiple Adverse Weather Removal via a Unified Quaternion Neural Architecture</h2>
            <p class="paper-summary">Images used in real-world applications such as image or video retrieval,
outdoor surveillance, and autonomous driving suffer from poor weather
conditions. When designing robust computer vision systems, removing adverse
weather such as haze, rain, and snow is a significant problem. Recently,
deep-learning methods offered a solution for a single type of degradation.
Current state-of-the-art universal methods struggle with combinations of
degradations, such as haze and rain-streak. Few algorithms have been developed
that perform well when presented with images containing multiple adverse
weather conditions. This work focuses on developing an efficient solution for
multiple adverse weather removal using a unified quaternion neural architecture
called CMAWRNet. It is based on a novel texture-structure decomposition block,
a novel lightweight encoder-decoder quaternion transformer architecture, and an
attentive fusion block with low-light correction. We also introduce a
quaternion similarity loss function to preserve color information better. The
quantitative and qualitative evaluation of the current state-of-the-art
benchmarking datasets and real-world images shows the performance advantages of
the proposed CMAWRNet compared to other state-of-the-art weather removal
approaches dealing with multiple weather artifacts. Extensive computer
simulations validate that CMAWRNet improves the performance of downstream
applications such as object detection. This is the first time the decomposition
approach has been applied to the universal weather removal task.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces cmawrnet, a quaternion neural network architecture for removing multiple adverse weather conditions from images, outperforming existing methods and improving downstream object detection performance. it uses a texture-structure decomposition block, a lightweight encoder-decoder quaternion transformer architecture, and an attentive fusion block with low-light correction.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为 cmawrnet 的四元数神经网络架构，用于去除图像中的多种不利天气条件，性能优于现有方法，并提高了下游目标检测的性能。它使用纹理-结构分解块、轻量级编码器-解码器四元数 transformer 架构和带有弱光校正的注意力融合块。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.01882v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Vladimir Frants, Sos Agaian, Karen Panetta, Peter Huang</p>
            
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.6000000000000001, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Local Herb Identification Using Transfer Learning: A CNN-Powered Mobile Application for Nepalese Flora</h2>
            <p class="paper-summary">Herb classification presents a critical challenge in botanical research,
particularly in regions with rich biodiversity such as Nepal. This study
introduces a novel deep learning approach for classifying 60 different herb
species using Convolutional Neural Networks (CNNs) and transfer learning
techniques. Using a manually curated dataset of 12,000 herb images, we
developed a robust machine learning model that addresses existing limitations
in herb recognition methodologies. Our research employed multiple model
architectures, including DenseNet121, 50-layer Residual Network (ResNet50),
16-layer Visual Geometry Group Network (VGG16), InceptionV3, EfficientNetV2,
and Vision Transformer (VIT), with DenseNet121 ultimately demonstrating
superior performance. Data augmentation and regularization techniques were
applied to mitigate overfitting and enhance the generalizability of the model.
This work advances herb classification techniques, preserving traditional
botanical knowledge and promoting sustainable herb utilization.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper presents a cnn-based mobile application for local herb identification in nepal, utilizing transfer learning and a curated dataset to classify 60 herb species, with densenet121 showing the best performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种基于cnn的手机应用程序，用于尼泊尔当地草药识别，利用迁移学习和精选数据集对60种草药进行分类，其中densenet121表现最佳。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2505.02147v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Prajwal Thapa, Mridul Sharma, Jinu Nyachhyon, Yagya Raj Pandeya</p>
            
        </motion.div>
        
    </div>

    <footer class="footer">
        Generated on 2025-05-07 04:30:02 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank">onion-liu</a>.
    </footer>

</body>
</html>